{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from IPython.core.interactiveshell import InteractiveShell\n",
    "InteractiveShell.ast_node_interactivity = 'all'  # default is 'last_expr'\n",
    "\n",
    "%load_ext autoreload\n",
    "%autoreload 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "sys.path.append('/home/mink/notebooks/CameraTraps')  # append this repo to PYTHONPATH\n",
    "sys.path.append('/home/mink/lib/ai4eutils')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import os\n",
    "from collections import Counter, defaultdict\n",
    "from random import sample\n",
    "from shutil import copyfile\n",
    "from multiprocessing.pool import ThreadPool\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm\n",
    "\n",
    "import path_utils  # ai4eutils\n",
    "\n",
    "from data_management.megadb.schema import sequences_schema_check\n",
    "from data_management.megadb.converters.cct_to_megadb import process_sequences"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# umn_gomez"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_name = 'umn_gomez'\n",
    "\n",
    "container_root = '/mink_disk_0/camtraps/umn-gomez/'  \n",
    "path_prefix = 'raw-pictures-habfrag/'\n",
    "\n",
    "path_to_output = f'/home/mink/camtraps/data/megadb_jsons/{dataset_name}.json' \n",
    "path_to_output_temp = f'/home/mink/camtraps/data/megadb_jsons/{dataset_name}_temp.json' "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 0 - Add an entry to the `datasets` table\n",
    "\n",
    "Done"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 1 - Prepare the `sequence` objects to insert into the database\n",
    "\n",
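    "The species labels are provided in a CSV, but unfortunately there is no sequence information. The location is encoded in the file name (it is the top-level folder, e.g. `N27` in `N27/09060810.JPG`). They compared the MegaDetector (MD) results with the manually labeled classes (species, and empty/human/animal).\n",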
    "\n",
    "Not all images in the folder in blob storage have entries in the CSV."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "csv_path = os.path.join(container_root, 'row-pictures-habfrag_all_images_md_vs_manual_label.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv(csv_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "75557"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>filename</th>\n",
       "      <th>timestamp</th>\n",
       "      <th>common_name_wi</th>\n",
       "      <th>megaD_max_confidence</th>\n",
       "      <th>md_class</th>\n",
       "      <th>true_class</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>66338</th>\n",
       "      <td>N27/09060810.JPG</td>\n",
       "      <td>2019-09-06 06:33:13</td>\n",
       "      <td>South American Coati</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Animal</td>\n",
       "      <td>Animal</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41449</th>\n",
       "      <td>N14/11230318.JPG</td>\n",
       "      <td>2019-11-23 13:52:28</td>\n",
       "      <td>Collared Peccary</td>\n",
       "      <td>1.0</td>\n",
       "      <td>Animal</td>\n",
       "      <td>Animal</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               filename            timestamp        common_name_wi  \\\n",
       "66338  N27/09060810.JPG  2019-09-06 06:33:13  South American Coati   \n",
       "41449  N14/11230318.JPG  2019-11-23 13:52:28      Collared Peccary   \n",
       "\n",
       "       megaD_max_confidence md_class true_class  \n",
       "66338                   1.0   Animal     Animal  \n",
       "41449                   1.0   Animal     Animal  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(df)\n",
    "df.sample(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['Animal', 'Blank', 'Human'], dtype=object)"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['true_class'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['Rodent', \"Spix's Guan\", 'Blank', 'Bird', 'Unknown species',\n",
       "       'Black Agouti', 'Tayra', 'Southern Tamandua', 'Dasypus Species',\n",
       "       'Human', \"Salvin's Curassow\", 'Nine-banded Armadillo',\n",
       "       'Collared Peccary', 'Ocelot', 'Giant Anteater', 'Cervidae Family',\n",
       "       'Bos Species', 'Bush Dog', 'South American Coati', 'Lowland Tapir',\n",
       "       'Amazonian Motmot', 'Possum Family', 'Peccary Family',\n",
       "       'Caprimulgidae Family', 'Giant Armadillo',\n",
       "       'Northern Amazon Red Squirrel', 'White-tailed Deer',\n",
       "       'Spotted Paca', 'Red Brocket', 'Mazama Species', 'Mammal',\n",
       "       'Lizards and Snakes', 'White-lipped Peccary',\n",
       "       'Columbiformes Order', 'Aphelocoma Species', 'No CV Result',\n",
       "       'Giant Ameiva', 'Sciuridae Family', 'Armadillo Family',\n",
       "       'Turtle Order', 'Dasyprocta Species', 'Tamandua Species', 'Puma',\n",
       "       'Tortoise Family', 'Margarita Island Capuchin', 'Weasel Family',\n",
       "       'Jaguar', 'Saimiri Species', 'Coendou Species',\n",
       "       'Razor-billed Curassow', 'White-browed Guan', 'Primate',\n",
       "       'Didelphimorphia Order', 'Capybara', 'Sunbittern',\n",
       "       'Neotropical Otter', 'Dasyproctidae Family'], dtype=object)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df['common_name_wi'].unique()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "111878"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "111829"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Root folder of the images on the mounted container.\n",
    "folder = os.path.join(container_root, path_prefix)\n",
    "\n",
    "all_files = path_utils.recursive_file_list(folder)\n",
    "len(all_files)\n",
    "\n",
    "# Keep image files only, skip hidden files, and express each path relative to `folder`.\n",
    "# os.path.relpath does not depend on `folder` ending with a separator and cannot be confused\n",
    "# when the folder string happens to occur again further down a path, unlike p.split(folder)[1].\n",
    "paths = sorted(\n",
    "    os.path.relpath(p, folder)\n",
    "    for p in all_files\n",
    "    if path_utils.is_image_file(p) and not os.path.basename(p).startswith('.')\n",
    ")\n",
    "len(paths)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'N36/11210994.JPG'"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "paths[-100]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "75557it [00:09, 8005.09it/s]\n"
     ]
    }
   ],
   "source": [
    "locations = set()\n",
    "sequences = []\n",
    "\n",
    "# There is no sequence information, so every CSV row (one image) becomes its own\n",
    "# single-image sequence with a 'dummy_' sequence ID derived from the file name.\n",
    "# Passing `total` lets tqdm show a real progress bar, since iterrows() has no length.\n",
    "for _, row in tqdm(df.iterrows(), total=len(df)):\n",
    "\n",
    "    fn = row['filename']\n",
    "    species = row['common_name_wi'].lower()\n",
    "    if species == 'blank':\n",
    "        species = 'empty'\n",
    "\n",
    "    # the first path component is the location, e.g. 'N27' in 'N27/09060810.JPG'\n",
    "    location = fn.split('/')[0]\n",
    "    locations.add(location)\n",
    "\n",
    "    # strip only the file extension, so a dot elsewhere in the path cannot truncate the ID\n",
    "    seq_id = os.path.splitext(fn)[0].replace('/', '_')\n",
    "\n",
    "    sequences.append({\n",
    "        'dataset': dataset_name,\n",
    "        'seq_id': f'dummy_{seq_id}',\n",
    "        'images': [\n",
    "            {\n",
    "                'file': fn,\n",
    "                'frame_num': 1, # only one image, but easier for ingesting the annotations\n",
    "                'datetime': row['timestamp'], # extra field,\n",
    "                'mdv4_class': row['md_class'],\n",
    "                'mdv4_conf': row['megaD_max_confidence'],\n",
    "                'true_obj_class': row['true_class']\n",
    "            }\n",
    "        ],\n",
    "        'location': location,\n",
    "        'class': [species]\n",
    "    })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "13132"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Count the sequences whose label is 'empty' (each True contributes 1 to the sum).\n",
    "num_empty = sum(seq['class'][0] == 'empty' for seq in sequences)\n",
    "num_empty"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.17380255965694774"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Fraction of sequences labeled empty; use the actual count instead of a hard-coded total.\n",
    "num_empty / len(sequences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "44"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(locations)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "75557"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "[{'dataset': 'umn_gomez',\n",
       "  'seq_id': 'dummy_M05_08070432',\n",
       "  'images': [{'file': 'M05/08070432.JPG',\n",
       "    'frame_num': 1,\n",
       "    'datetime': '2019-08-07 15:58:52',\n",
       "    'mdv4_class': 'Animal',\n",
       "    'mdv4_conf': 1.0,\n",
       "    'true_obj_class': 'Animal'}],\n",
       "  'location': 'M05',\n",
       "  'class': ['collared peccary']},\n",
       " {'dataset': 'umn_gomez',\n",
       "  'seq_id': 'dummy_M05_08070433',\n",
       "  'images': [{'file': 'M05/08070433.JPG',\n",
       "    'frame_num': 1,\n",
       "    'datetime': '2019-08-07 15:58:53',\n",
       "    'mdv4_class': 'Animal',\n",
       "    'mdv4_conf': 1.0,\n",
       "    'true_obj_class': 'Animal'}],\n",
       "  'location': 'M05',\n",
       "  'class': ['collared peccary']},\n",
       " {'dataset': 'umn_gomez',\n",
       "  'seq_id': 'dummy_M05_08070434',\n",
       "  'images': [{'file': 'M05/08070434.JPG',\n",
       "    'frame_num': 1,\n",
       "    'datetime': '2019-08-07 15:58:54',\n",
       "    'mdv4_class': 'Animal',\n",
       "    'mdv4_conf': 1.0,\n",
       "    'true_obj_class': 'Animal'}],\n",
       "  'location': 'M05',\n",
       "  'class': ['collared peccary']}]"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(sequences)\n",
    "sequences[20000:20003]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Step 2 - Pass the schema check\n",
    "\n",
    "Once your metadata are in the MegaDB format for `sequence` items, we check that they conform to the format's schema.\n",
    "\n",
    "If the format conforms, the following messages will be printed:\n",
    "\n",
    "```\n",
    "Verified that the sequence items meet requirements not captured by the schema.\n",
    "Verified that the sequence items conform to the schema.\n",
    "```\n",
    "\n",
    "For large datasets, the second step will take some time (~ a minute). \n",
    "\n",
    "Otherwise there will be an error message describing what's wrong. Please fix the issues until all checks are passed. You might need to write some snippets of code to loop through the `sequence` items to understand which entries have problems."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Verified that the sequence items meet requirements not captured by the schema.\n",
      "Verified that the sequence items conform to the schema.\n",
      "CPU times: user 9.64 s, sys: 15.6 ms, total: 9.66 s\n",
      "Wall time: 9.66 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "sequences_schema_check.sequences_schema_check(sequences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(path_to_output_temp, 'w', encoding='utf-8') as f:\n",
    "    json.dump(sequences, f, indent=1, ensure_ascii=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Step 2b - copy non-empties to flat folder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "def copy_file(src_path, dst_path):\n",
    "    \"\"\"Copy one file from src_path to dst_path and return the result of shutil.copyfile.\n",
    "\n",
    "    Kept as a named two-argument function so it can be handed to pool.starmap\n",
    "    together with a list of (src_path, dst_path) tuples.\n",
    "    \"\"\"\n",
    "    copied_to = copyfile(src_path, dst_path)\n",
    "    return copied_to"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 75557/75557 [00:05<00:00, 12818.89it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 675 ms, sys: 259 ms, total: 934 ms\n",
      "Wall time: 5.9 s\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "# Flat folder that receives a copy of every non-empty image.\n",
    "dst_folder = '/mink_disk_0/camtraps/imerit12d'\n",
    "os.makedirs(dst_folder, exist_ok=True)  # shutil.copyfile fails if the target folder is missing\n",
    "\n",
    "# Build (source, destination) pairs; the actual copying happens in a later cell.\n",
    "path_pairs = []\n",
    "for seq in tqdm(sequences):\n",
    "\n",
    "    if seq['class'][0] == 'empty':\n",
    "        continue\n",
    "\n",
    "    seq_id = seq['seq_id']\n",
    "\n",
    "    im = seq['images'][0]  # only the first image of the sequence is used\n",
    "    frame = im['frame_num']\n",
    "\n",
    "    src_path = os.path.join(container_root, path_prefix, im['file'])\n",
    "    # fail early, with the offending path, if the CSV lists a file that is not on disk\n",
    "    assert os.path.exists(src_path), src_path\n",
    "\n",
    "    dst_path = os.path.join(dst_folder,\n",
    "                            f'{dataset_name}.seq{seq_id}.frame{frame}.jpg')\n",
    "    path_pairs.append((src_path, dst_path))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "62425"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "text/plain": [
       "('/mink_disk_0/camtraps/umn-gomez/raw-pictures-habfrag/M00/10090027.JPG',\n",
       " '/mink_disk_0/camtraps/imerit12d/umn_gomez.seqdummy_M00_10090027.frame1.jpg')"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(path_pairs)  # number of non-empty images: 62425 out of a total of 75557 (83%)\n",
    "path_pairs[3000]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CPU times: user 35.4 s, sys: 1min 40s, total: 2min 15s\n",
      "Wall time: 5min 49s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "\n",
    "with ThreadPool(8) as pool:\n",
    "    dst_paths = pool.starmap(copy_file, path_pairs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "62425"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(dst_paths)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python [conda env:cameratraps] *",
   "language": "python",
   "name": "conda-env-cameratraps-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
